from math import log
import operator

def calcShannonEnt(dataSet):  # 计算数据的熵(entropy)
    numEntries = len(dataSet)  # 求出数据有多少个对象，即有多少行,计算实例总数
    labelCounts = {}
    for featVec in dataSet:  # 对数据集逐行求最后一类的数据，并将统计最后一列数据的数目
        currentLabel = featVec[-1]  # 每行数据的最后一个字（类别）
        if currentLabel not in labelCounts.keys():  # 创建一个字典，键值是最后一列的数值
            labelCounts[currentLabel] = 0  # 当前键值不存在，则扩展字典并将此键值加入字典
        labelCounts[currentLabel] += 1  # 每个键值都记录了当前类别出现的次数
    shannonEnt = 0
    for key in labelCounts:
        prob = float(labelCounts[key])/numEntries  # 计算单个类的熵值
        shannonEnt -= prob*log(prob, 2)  # 累加每个类的熵值
    return shannonEnt

def createDataSet1():    # 创造示例数据
    dataSet = [['长', '粗', '男'],
               ['短', '粗', '男'],
               ['短', '粗', '男'],
               ['长', '细', '女'],
               ['短', '细', '女'],
               ['短', '粗', '女'],
               ['长', '粗', '女'],
               ['长', '粗', '女']]
    labels = ['头发', '声音']   # 两个特征
    return dataSet, labels

def splitDataSet(dataSet,axis,value):  # 按某个特征分类后的数据
    retDataSet=[]
    for featVec in dataSet:
        if featVec[axis] == value:  # 判断axis列的值是否为value
            reducedFeatVec = featVec[:axis]  # [:axis]表示前axis行，即若axis为2，就是取featVec的前axis行
            reducedFeatVec.extend(featVec[axis+1:])  # [axis+1:]表示从跳过axis+1行，取接下来的数据
            retDataSet.append(reducedFeatVec)
    return retDataSet

def chooseBestFeatureToSplit(dataSet):  # 选择最优的分类特征
    numFeatures = len(dataSet[0])-1
    baseEntropy = calcShannonEnt(dataSet)  # 原始的熵
    bestInfoGain = 0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)  # 创建了一个列表，里面的元素是dataSet所有的元素，不重复
        newEntropy = 0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet,i,value)
            prob =len(subDataSet)/float(len(dataSet))
            newEntropy +=prob*calcShannonEnt(subDataSet)  # 计算按每个数据特征来划分的数据集的熵
        infoGain = baseEntropy - newEntropy  # 原始熵与按特征分类后的熵的差值
        if (infoGain > bestInfoGain):
            bestInfoGain = infoGain
            bestFeature = i  # 判断出哪种划分方式得到最大的信息增益，且得出该划分方式所选择的特征
    return bestFeature

def majorityCnt(classList):    #按分类后类别数量排序，比如：最后分类为2男1女，则判定为男；
    classCount={}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote]=0
        classCount[vote]+=1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

def createTree(dataSet,labels):
    classList = [example[-1] for example in dataSet]  # 类别：男或女
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)  # 选择最优特征
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}  # 分类结果以字典形式保存
    del(labels[bestFeat])
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree


dataSet, labels = createDataSet1()  # 创造示列数据
print(createTree(dataSet, labels))  # 输出决策树模型结果
